Loading the libraries¶

In [144]:
!pip install ydata_profiling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pandas_profiling import ProfileReport
Requirement already satisfied: ydata_profiling in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (4.0.0)
Requirement already satisfied: visions[type_image_path]==0.7.5 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.7.5)
Requirement already satisfied: seaborn<0.13,>=0.10.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.11.2)
Requirement already satisfied: typeguard<2.14,>=2.13.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (2.13.3)
Requirement already satisfied: PyYAML<6.1,>=5.0.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (6.0)
Requirement already satisfied: matplotlib<3.7,>=3.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (3.5.2)
Requirement already satisfied: htmlmin==0.1.12 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.1.12)
Requirement already satisfied: scipy<1.10,>=1.4.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.9.1)
Requirement already satisfied: statsmodels<0.14,>=0.13.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.13.2)
Requirement already satisfied: multimethod<1.10,>=1.4 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.9.1)
Requirement already satisfied: jinja2<3.2,>=2.11.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (2.11.3)
Requirement already satisfied: numpy<1.24,>=1.16.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.21.5)
Requirement already satisfied: pydantic<1.11,>=1.8.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.10.5)
Requirement already satisfied: tqdm<4.65,>=4.48.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (4.64.1)
Requirement already satisfied: pandas!=1.4.0,<1.6,>1.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.4.4)
Requirement already satisfied: phik<0.13,>=0.11.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.12.3)
Requirement already satisfied: requests<2.29,>=2.24.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (2.28.1)
Requirement already satisfied: networkx>=2.4 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (2.8.4)
Requirement already satisfied: attrs>=19.3.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (21.4.0)
Requirement already satisfied: tangled-up-in-unicode>=0.0.4 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (0.2.0)
Requirement already satisfied: imagehash in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (4.3.1)
Requirement already satisfied: Pillow in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (9.2.0)
Requirement already satisfied: MarkupSafe>=0.23 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from jinja2<3.2,>=2.11.1->ydata_profiling) (2.0.1)
Requirement already satisfied: kiwisolver>=1.0.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (1.4.2)
Requirement already satisfied: fonttools>=4.22.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (4.25.0)
Requirement already satisfied: packaging>=20.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (21.3)
Requirement already satisfied: python-dateutil>=2.7 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (2.8.2)
Requirement already satisfied: pyparsing>=2.2.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (3.0.9)
Requirement already satisfied: cycler>=0.10 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (0.11.0)
Requirement already satisfied: pytz>=2020.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from pandas!=1.4.0,<1.6,>1.1->ydata_profiling) (2022.1)
Requirement already satisfied: joblib>=0.14.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from phik<0.13,>=0.11.1->ydata_profiling) (1.1.0)
Requirement already satisfied: typing-extensions>=4.2.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from pydantic<1.11,>=1.8.1->ydata_profiling) (4.3.0)
Requirement already satisfied: idna<4,>=2.5 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (3.3)
Requirement already satisfied: certifi>=2017.4.17 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (2022.9.24)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (1.26.11)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (2.0.4)
Requirement already satisfied: patsy>=0.5.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from statsmodels<0.14,>=0.13.2->ydata_profiling) (0.5.2)
Requirement already satisfied: six in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from patsy>=0.5.2->statsmodels<0.14,>=0.13.2->ydata_profiling) (1.16.0)
Requirement already satisfied: PyWavelets in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from imagehash->visions[type_image_path]==0.7.5->ydata_profiling) (1.3.0)

Loading dataset¶

In [116]:
# Dataset 1 (adult.data).
# header=None: the first line of the file is read as data, not as column names.
df1 = pd.read_csv(
    "adult_data.csv",
    header=None,
)
df1.shape
Out[116]:
(32561, 15)
In [117]:
# Dataset 2 (adult.test).
# header=None: the first line of the file is read as data, not as column names.
df2 = pd.read_csv(
    "adult_test.csv",
    header=None,
)
df2.shape
Out[117]:
(16281, 15)
In [118]:
# Stack both frames row-wise; ignore_index=True renumbers the rows
# from 0 instead of keeping each file's own row labels.
df = pd.concat((df1, df2), axis=0, ignore_index=True)
In [119]:
# shape of the original dataset after combining
# (rows of df1 plus rows of df2, same number of columns)

df.shape
Out[119]:
(48842, 15)
In [120]:
# The files were read with header=None, so the columns only have positional
# labels; attach the 15 descriptive names in file order.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'country',
    'target income',
]
df.columns = column_names
df.head()
Out[120]:
age workclass fnlwgt education education-num marital-status occupation relationship race gender capital-gain capital-loss hours-per-week country target income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K

Running EDA¶

In [121]:
# Profiling report for the combined frame, before any cleaning is applied.
EDA = ProfileReport(
    df,
    title="EDA of the Adult Dataset",
    html={"style": {"full_width": True}},
)
In [122]:
# Bare expression: the report object is shown as this cell's output.
EDA
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[122]:

In [123]:
# Summary statistics, transposed so that each described column is one row.
df.describe().T
Out[123]:
count mean std min 25% 50% 75% max
age 48842.0 38.643585 13.710510 17.0 28.0 37.0 48.0 90.0
fnlwgt 48842.0 189664.134597 105604.025423 12285.0 117550.5 178144.5 237642.0 1490400.0
education-num 48842.0 10.078089 2.570973 1.0 9.0 10.0 12.0 16.0
capital-gain 48842.0 1079.067626 7452.019058 0.0 0.0 0.0 0.0 99999.0
capital-loss 48842.0 87.502314 403.004552 0.0 0.0 0.0 0.0 4356.0
hours-per-week 48842.0 40.422382 12.391444 1.0 40.0 40.0 45.0 99.0

Removing Missing values (full dataset)¶

In [88]:
# Optional inspection (left disabled): uncomment a line to list the category
# frequencies of that column in the combined frame.
# df.workclass.value_counts()
# df.occupation.value_counts()
# df.country.value_counts()
In [124]:
# Clean a copy, so the combined frame `df` itself is left unchanged.
df_modified = df.copy()
In [125]:
# (rows, columns) of the working copy before any rows are removed.
df_modified.shape
Out[125]:
(48842, 15)
In [ ]:
 
In [126]:
# Missing values in workclass, occupation and country are marked with '?'.
# To look at the affected rows, uncomment one of:
# df_modified[df_modified.occupation.str.contains('?', regex=False)]
# df_modified[df_modified.workclass.str.contains('?', regex=False)]
# df_modified[df_modified.country.str.contains('?', regex=False)]

# Flag every row that has '?' in any of the three columns.
# regex=False: '?' is matched literally, not as a regex quantifier.
# na=True: a NaN entry is flagged as well, so such rows are removed exactly
# as the earlier `... == False` comparison removed them.
has_placeholder = (
    df_modified['occupation'].str.contains('?', regex=False, na=True)
    | df_modified['workclass'].str.contains('?', regex=False, na=True)
    | df_modified['country'].str.contains('?', regex=False, na=True)
)

# Keep the unflagged rows; .copy() gives an independent frame so the column
# assignments in later cells do not operate on a slice of the old one.
df_modified = df_modified[~has_placeholder].copy()
In [127]:
# shape of the dataset after removing '?'
# (only rows are dropped, so the column count stays the same)

df_modified.shape
Out[127]:
(45222, 15)
In [128]:
# Check that no '?' is left in ANY of the three cleaned columns (not just
# workclass), then show the remaining workclass categories.
columns_still_dirty = [
    column
    for column in ('workclass', 'occupation', 'country')
    if df_modified[column].str.contains('?', regex=False, na=False).any()
]
assert not columns_still_dirty, f"'?' still present in: {columns_still_dirty}"

df_modified.workclass.value_counts()
Out[128]:
 Private             33307
 Self-emp-not-inc     3796
 Local-gov            3100
 State-gov            1946
 Self-emp-inc         1646
 Federal-gov          1406
 Without-pay            21
Name: workclass, dtype: int64

Feature Engineering¶

In [129]:
# Frequency of each education level before the levels are grouped.
print(df_modified.education.value_counts())
 HS-grad         14783
 Some-college     9899
 Bachelors        7570
 Masters          2514
 Assoc-voc        1959
 11th             1619
 Assoc-acdm       1507
 10th             1223
 7th-8th           823
 Prof-school       785
 9th               676
 12th              577
 Doctorate         544
 5th-6th           449
 1st-4th           222
 Preschool          72
Name: education, dtype: int64
In [130]:
# Fold the raw education levels into broader groups.
# key = raw label, value = group it is folded into. Labels that already carry
# their final name (Bachelors, Masters, Doctorate) need no entry.
education_groups = {
    '11th': 'Dropout',
    '9th': 'Dropout',
    '7th-8th': 'Dropout',
    '5th-6th': 'Dropout',
    '10th': 'Dropout',
    '1st-4th': 'Dropout',
    'Preschool': 'Dropout',
    '12th': 'Dropout',
    'HS-grad': 'High Grad',
    'Prof-school': 'Masters',
    'Some-college': 'Community College',
    'Assoc-acdm': 'Community College',
    'Assoc-voc': 'Community College',
}

# Substring replacement rather than whole-value mapping: the stored values
# have a leading space (e.g. ' HS-grad'), which is kept in the result.
# regex=False because the labels are literal text, not patterns.
education = df_modified['education']
for raw_label, group_label in education_groups.items():
    education = education.str.replace(raw_label, group_label, regex=False)
df_modified['education'] = education

df_modified['education'].unique()
Out[130]:
array([' Bachelors', ' High Grad', ' Dropout', ' Masters',
       ' Community College', ' Doctorate'], dtype=object)
In [131]:
# Frequency of each education group after the grouping step.
print(df_modified.education.value_counts())
 High Grad            14783
 Community College    13365
 Bachelors             7570
 Dropout               5661
 Masters               3299
 Doctorate              544
Name: education, dtype: int64
In [132]:
# Frequency of each marital-status value before the values are grouped.
print(df_modified['marital-status'].value_counts())
 Married-civ-spouse       21055
 Never-married            14598
 Divorced                  6297
 Separated                 1411
 Widowed                   1277
 Married-spouse-absent      552
 Married-AF-spouse           32
Name: marital-status, dtype: int64
In [133]:
# Fold the raw marital-status values into broader groups.
# key = raw label, value = group it is folded into. 'Separated' and 'Widowed'
# already carry their final name and need no entry.
marital_groups = {
    'Never-married': 'Unmarried',
    'Married-civ-spouse': 'Married',
    'Divorced': 'Separated',
    'Married-spouse-absent': 'Married',
    'Married-AF-spouse': 'Married',
}

# Substring replacement keeps the leading space of the stored values
# (e.g. ' Divorced' becomes ' Separated'); regex=False because the labels
# are literal text, not patterns.
marital_status = df_modified['marital-status']
for raw_label, group_label in marital_groups.items():
    marital_status = marital_status.str.replace(raw_label, group_label, regex=False)
df_modified['marital-status'] = marital_status

df_modified['marital-status'].unique()
Out[133]:
array([' Unmarried', ' Married', ' Separated', ' Widowed'], dtype=object)
In [134]:
# Frequency of each marital-status group after the grouping step.
print(df_modified['marital-status'].value_counts())
 Married      21639
 Unmarried    14598
 Separated     7708
 Widowed       1277
Name: marital-status, dtype: int64
In [135]:
# Frequency of each income label; used to spot spelling variants of the
# same class before they are merged.
print(df_modified['target income'].value_counts())
 <=50K     22654
 <=50K.    11360
 >50K       7508
 >50K.      3700
Name: target income, dtype: int64
In [136]:
# Merge the label variants that end in '.' into their plain counterparts.
# regex=False: the '.' is matched literally, not as "any character".
income = df_modified['target income']
for dotted_label, plain_label in (('<=50K.', '<=50K'), ('>50K.', '>50K')):
    income = income.str.replace(dotted_label, plain_label, regex=False)
df_modified['target income'] = income

df_modified['target income'].unique()
Out[136]:
array([' <=50K', ' >50K'], dtype=object)
In [137]:
# Frequency of each income label after the variants were merged.
print(df_modified['target income'].value_counts())
 <=50K    34014
 >50K     11208
Name: target income, dtype: int64

Dropping duplicates¶

In [138]:
# Remove rows that are identical in every column; with the default arguments
# the first occurrence of each duplicated row is the one kept.
df_modified = df_modified.drop_duplicates()
In [139]:
# shape of the dataset after dropping duplicates
# (only rows are dropped, so the column count stays the same)

df_modified.shape
Out[139]:
(45175, 15)

Comparing EDA of 2 datasets¶

In [140]:
# Profile the cleaned frame, compare it with the report of the original
# frame (`EDA`), and write the side-by-side comparison to an HTML file
# in the working directory.
EDA_modified = ProfileReport(df_modified, title="EDA of modified dataset")
comparison_report = EDA.compare(EDA_modified)
comparison_report.to_file("original_vs_transformed.html")
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]
In [141]:
# Bare expression: the comparison report is shown as this cell's output.
comparison_report
Out[141]:

In [ ]:
 
In [ ]:
 

Class Imbalance¶

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: